library(readr)
library(knitr)
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
# Read the housing dataset from housing.csv in the working directory.
# readr reports the column types it inferred while parsing.
housing_data <- read_csv(file = "housing.csv")
## Parsed with column specification:
## cols(
##   longitude = col_double(),
##   latitude = col_double(),
##   housing_median_age = col_double(),
##   total_rooms = col_double(),
##   total_bedrooms = col_double(),
##   population = col_double(),
##   households = col_double(),
##   median_income = col_double(),
##   median_house_value = col_double(),
##   ocean_proximity = col_character()
## )
# Preview the first 100 median house values. head() stops at the end of the
# vector, whereas indexing with 1:100 would pad the result with NA if fewer
# than 100 rows were read.
head(housing_data$median_house_value, n = 100)
##   [1] 452600 358500 352100 341300 342200 269700 299200 241400 226700 261100
##  [11] 281500 241800 213500 191300 159200 140000 152500 155500 158700 162900
##  [21] 147500 159800 113900  99700 132600 107500  93800 105500 108900 132000
##  [31] 122300 115200 110400 104900 109700  97200 104500 103900 191400 176000
##  [41] 155400 150000 118800 188800 184400 182300 142500 137500 187500 112500
##  [51] 171900  93800  97500 104200  87500  83100  87500  85300  80300  60000
##  [61]  75700  75000  86100  76100  73500  78400  84400  81300  85000 129200
##  [71]  82500  95200  75000  67500 137500 177500 102100 108300 112500 131300
##  [81] 162500 112500 112500 137500 118800  98200 118800 162500 137500 500001
##  [91] 162500 137500 162500 187500 179200 130000 183800 125000 170000 193100
# Per-column summary statistics for the raw data. Among the numeric columns,
# only total_bedrooms reports missing values (207 NAs); these will need to be
# imputed.
summary(object = housing_data)
##    longitude         latitude     housing_median_age  total_rooms   
##  Min.   :-124.3   Min.   :32.54   Min.   : 1.00      Min.   :    2  
##  1st Qu.:-121.8   1st Qu.:33.93   1st Qu.:18.00      1st Qu.: 1448  
##  Median :-118.5   Median :34.26   Median :29.00      Median : 2127  
##  Mean   :-119.6   Mean   :35.63   Mean   :28.64      Mean   : 2636  
##  3rd Qu.:-118.0   3rd Qu.:37.71   3rd Qu.:37.00      3rd Qu.: 3148  
##  Max.   :-114.3   Max.   :41.95   Max.   :52.00      Max.   :39320  
##                                                                     
##  total_bedrooms     population      households     median_income    
##  Min.   :   1.0   Min.   :    3   Min.   :   1.0   Min.   : 0.4999  
##  1st Qu.: 296.0   1st Qu.:  787   1st Qu.: 280.0   1st Qu.: 2.5634  
##  Median : 435.0   Median : 1166   Median : 409.0   Median : 3.5348  
##  Mean   : 537.9   Mean   : 1425   Mean   : 499.5   Mean   : 3.8707  
##  3rd Qu.: 647.0   3rd Qu.: 1725   3rd Qu.: 605.0   3rd Qu.: 4.7432  
##  Max.   :6445.0   Max.   :35682   Max.   :6082.0   Max.   :15.0001  
##  NA's   :207                                                        
##  median_house_value ocean_proximity   
##  Min.   : 14999     Length:20640      
##  1st Qu.:119600     Class :character  
##  Median :179700     Mode  :character  
##  Mean   :206856                       
##  3rd Qu.:264725                       
##  Max.   :500001                       
## 
library(ggplot2)
# Inspect the shape of the total_bedrooms distribution to help decide what
# value to impute for its missing entries.
# geom_histogram() plots the number of rows per bin by default, so the y axis
# is labelled as a count rather than a density.
ggplot(housing_data, aes(x = total_bedrooms)) +
  geom_histogram(bins = 40) +
  xlab("Total Bedrooms") +
  ylab("Count") +
  ggtitle("Histogram of Total Bedrooms (noncontinuous variable)")
## Warning: Removed 207 rows containing non-finite values (stat_bin).

#using mean for now
library(mice)
## Warning: package 'mice' was built under R version 3.5.3
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Impute the missing total_bedrooms values with mice using the "mean" method;
# the fixed seed keeps the run reproducible.
# NOTE(review): mean imputation looks deterministic, so the five requested
# imputations are presumably identical -- confirm whether m = 5 is needed.
housing_data_temp <- mice(housing_data, method = "mean", m = 5, seed = 420)
## 
##  iter imp variable
##   1   1  total_bedrooms
##   1   2  total_bedrooms
##   1   3  total_bedrooms
##   1   4  total_bedrooms
##   1   5  total_bedrooms
##   2   1  total_bedrooms
##   2   2  total_bedrooms
##   2   3  total_bedrooms
##   2   4  total_bedrooms
##   2   5  total_bedrooms
##   3   1  total_bedrooms
##   3   2  total_bedrooms
##   3   3  total_bedrooms
##   3   4  total_bedrooms
##   3   5  total_bedrooms
##   4   1  total_bedrooms
##   4   2  total_bedrooms
##   4   3  total_bedrooms
##   4   4  total_bedrooms
##   4   5  total_bedrooms
##   5   1  total_bedrooms
##   5   2  total_bedrooms
##   5   3  total_bedrooms
##   5   4  total_bedrooms
##   5   5  total_bedrooms
## Warning: Number of logged events: 1
# Extract the first completed (imputed) dataset from the mice result.
housing_data_full <- complete(housing_data_temp, 1)

# Remove the text variable for now. It is dropped by name rather than by
# position so a change in column order cannot silently drop the wrong column
# (cor() below requires all remaining columns to be numeric).
keep_cols <- setdiff(names(housing_data_full), "ocean_proximity")
housing_data_nc <- housing_data_full[, keep_cols, drop = FALSE]

# Pairwise correlations between all remaining columns.
corrmatrix <- cor(housing_data_nc)

# A correlation matrix is symmetric, so it is printed without transposing.
kable(corrmatrix)
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
longitude 1.0000000 -0.9246644 -0.1081968 0.0445680 0.0692597 0.0997732 0.0553101 -0.0151759 -0.0459666
latitude -0.9246644 1.0000000 0.0111727 -0.0360996 -0.0666584 -0.1087847 -0.0710354 -0.0798091 -0.1441603
housing_median_age -0.1081968 0.0111727 1.0000000 -0.3612622 -0.3189983 -0.2962442 -0.3029160 -0.1190340 0.1056234
total_rooms 0.0445680 -0.0360996 -0.3612622 1.0000000 0.9272527 0.8571260 0.9184845 0.1980496 0.1341531
total_bedrooms 0.0692597 -0.0666584 -0.3189983 0.9272527 1.0000000 0.8739095 0.9747249 -0.0076819 0.0494535
population 0.0997732 -0.1087847 -0.2962442 0.8571260 0.8739095 1.0000000 0.9072223 0.0048343 -0.0246497
households 0.0553101 -0.0710354 -0.3029160 0.9184845 0.9747249 0.9072223 1.0000000 0.0130331 0.0658427
median_income -0.0151759 -0.0798091 -0.1190340 0.1980496 -0.0076819 0.0048343 0.0130331 1.0000000 0.6880752
median_house_value -0.0459666 -0.1441603 0.1056234 0.1341531 0.0494535 -0.0246497 0.0658427 0.6880752 1.0000000
# Flag columns that findCorrelation() recommends removing because their
# absolute pairwise correlation exceeds 0.60.
# The response, median_house_value, is excluded from the search: it correlates
# with median_income above the cutoff, so including it could flag the target
# (or its strongest predictor) for removal.
# highcorr remains a vector of column indices into corrmatrix, which has the
# same column order as housing_data_nc.
predictor_names <- setdiff(colnames(corrmatrix), "median_house_value")
highcorr_names <- findCorrelation(
  corrmatrix[predictor_names, predictor_names],
  cutoff = 0.60,
  names = TRUE
)
highcorr <- match(highcorr_names, colnames(corrmatrix))
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(RColorBrewer)
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Scatter map of the records: position is longitude/latitude, colour is the
# median house value and point size is the population.
# The extra aesthetics (hma, tr, tb, hh, mi) are not used by geom_point();
# presumably they are mapped so that ggplotly() lists those columns in the
# hover tooltip -- confirm.
# NOTE(review): "Paired" is a qualitative Brewer palette; a sequential palette
# may read better for a continuous value. Left unchanged here.
plot_map <- ggplot(
  housing_data_full,
  aes(
    x = longitude, y = latitude, color = median_house_value,
    hma = housing_median_age, tr = total_rooms, tb = total_bedrooms,
    hh = households, mi = median_income
  )
) +
  geom_point(aes(size = population), alpha = 0.4) +
  xlab("Longitude") +
  ylab("Latitude") +
  ggtitle("Data Map - Longitude vs Latitude and Associated Variables") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_distiller(palette = "Paired", labels = comma) +
  labs(color = "Median House Value (in $USD)", size = "Population")

# Convert the static ggplot into an interactive plotly widget.
plot_map_tt <- ggplotly(plot_map)

# Display the interactive map.
plot_map_tt